In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlalchemy
import pandasql as ps
from pandasql import sqldf 
mysql = lambda q: sqldf(q, globals())
In [2]:
dim_match_summary = pd.read_csv('dim_match_summary.csv')
print(dim_match_summary.info())
dim_match_summary.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   team1      45 non-null     object
 1   team2      45 non-null     object
 2   winner     45 non-null     object
 3   margin     41 non-null     object
 4   ground     45 non-null     object
 5   matchDate  45 non-null     object
 6   match_id   45 non-null     object
dtypes: object(7)
memory usage: 2.6+ KB
None
Out[2]:
team1 team2 winner margin ground matchDate match_id
0 Namibia Sri Lanka Namibia 55 runs Geelong 16-Oct-22 T20I # 1823
1 Netherlands U.A.E. Netherlands 3 wickets Geelong 16-Oct-22 T20I # 1825
2 Scotland West Indies Scotland 42 runs Hobart 17-Oct-22 T20I # 1826
3 Ireland Zimbabwe Zimbabwe 31 runs Hobart 17-Oct-22 T20I # 1828
4 Namibia Netherlands Netherlands 5 wickets Geelong 18-Oct-22 T20I # 1830
In [3]:
#Converting the matchdate column into Date format 
dim_match_summary['matchDate'] = pd.to_datetime(dim_match_summary['matchDate'], format='%d-%b-%y')

#Removing T20I# from the matchId
dim_match_summary['matchId'] = dim_match_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')

#Replacing 'no result' and 'abandoned' with 'no result' in the winner column
dim_match_summary.loc[dim_match_summary['winner'].isin(['no result', 'abandoned']), 'winner'] = 'no result'

#Removing extra spaces and converting into lower
dim_match_summary['team1'] =dim_match_summary['team1'].astype(str).str.strip().str.lower()
dim_match_summary['team2'] =dim_match_summary['team2'].astype(str).str.strip().str.lower()
dim_match_summary['winner'] =dim_match_summary['winner'].astype(str).str.strip().str.lower()
dim_match_summary['ground'] =dim_match_summary['ground'].astype(str).str.strip().str.lower()


# Converting  teamInnings and batsmanName to Proper case
dim_match_summary['team1'] =dim_match_summary['team1'].str.title()
dim_match_summary['team2'] =dim_match_summary['team2'].str.title()
dim_match_summary['winner'] =dim_match_summary['winner'].str.title()
dim_match_summary['ground'] =dim_match_summary['ground'].str.title()


# Set margin to null for no result  matches
dim_match_summary.loc[dim_match_summary['winner'] == 'no result', 'margin'] = None

#Checking consistency
same_team_check = dim_match_summary[dim_match_summary['team1'] == dim_match_summary['team2']]

# Display rows where team1 and team2 are the same
if same_team_check.empty:
    print("All rows have different teams for 'team1' and 'team2'.")
else:
    print("There are rows where 'team1' and 'team2' are the same:")
    print(same_team_check)

print(dim_match_summary.duplicated().sum())
dim_match_summary.drop(columns='match_id',inplace=True)
dim_match_summary.to_csv('match_summary.csv', index=False)
dim_match_summary
All rows have different teams for 'team1' and 'team2'.
0
Out[3]:
team1 team2 winner margin ground matchDate matchId
0 Namibia Sri Lanka Namibia 55 runs Geelong 2022-10-16 1823
1 Netherlands U.A.E. Netherlands 3 wickets Geelong 2022-10-16 1825
2 Scotland West Indies Scotland 42 runs Hobart 2022-10-17 1826
3 Ireland Zimbabwe Zimbabwe 31 runs Hobart 2022-10-17 1828
4 Namibia Netherlands Netherlands 5 wickets Geelong 2022-10-18 1830
5 Sri Lanka U.A.E. Sri Lanka 79 runs Geelong 2022-10-18 1832
6 Ireland Scotland Ireland 6 wickets Hobart 2022-10-19 1833
7 West Indies Zimbabwe West Indies 31 runs Hobart 2022-10-19 1834
8 Netherlands Sri Lanka Sri Lanka 16 runs Geelong 2022-10-20 1835
9 Namibia U.A.E. U.A.E. 7 runs Geelong 2022-10-20 1836
10 Ireland West Indies Ireland 9 wickets Hobart 2022-10-21 1837
11 Scotland Zimbabwe Zimbabwe 5 wickets Hobart 2022-10-21 1838
12 Australia New Zealand New Zealand 89 runs Sydney 2022-10-22 1839
13 Afghanistan England England 5 wickets Perth 2022-10-22 1840
14 Ireland Sri Lanka Sri Lanka 9 wickets Hobart 2022-10-23 1841
15 India Pakistan India 4 wickets Melbourne 2022-10-23 1842
16 Bangladesh Netherlands Bangladesh 9 runs Hobart 2022-10-24 1843
17 South Africa Zimbabwe No Result NaN Hobart 2022-10-24 1844
18 Australia Sri Lanka Australia 7 wickets Perth 2022-10-25 1845
19 England Ireland Ireland 5 runs Melbourne 2022-10-26 1846
20 Afghanistan New Zealand No Result NaN Melbourne 2022-10-26 1846a
21 Bangladesh South Africa South Africa 104 runs Sydney 2022-10-27 1847
22 India Netherlands India 56 runs Sydney 2022-10-27 1848
23 Pakistan Zimbabwe Zimbabwe 1 run Perth 2022-10-27 1849
24 Afghanistan Ireland No Result NaN Melbourne 2022-10-28 1849a
25 Australia England No Result NaN Melbourne 2022-10-28 1849b
26 New Zealand Sri Lanka New Zealand 65 runs Sydney 2022-10-29 1850
27 Bangladesh Zimbabwe Bangladesh 3 runs Brisbane 2022-10-30 1851
28 Netherlands Pakistan Pakistan 6 wickets Perth 2022-10-30 1852
29 India South Africa South Africa 5 wickets Perth 2022-10-30 1853
30 Australia Ireland Australia 42 runs Brisbane 2022-10-31 1855
31 Afghanistan Sri Lanka Sri Lanka 6 wickets Brisbane 2022-11-01 1856
32 England New Zealand England 20 runs Brisbane 2022-11-01 1858
33 Netherlands Zimbabwe Netherlands 5 wickets Adelaide 2022-11-02 1859
34 Bangladesh India India 5 runs Adelaide 2022-11-02 1860
35 Pakistan South Africa Pakistan 33 runs Sydney 2022-11-03 1861
36 Ireland New Zealand New Zealand 35 runs Adelaide 2022-11-04 1862
37 Australia Afghanistan Australia 4 runs Adelaide 2022-11-04 1864
38 England Sri Lanka England 4 wickets Sydney 2022-11-05 1867
39 Netherlands South Africa Netherlands 13 runs Adelaide 2022-11-06 1871
40 Bangladesh Pakistan Pakistan 5 wickets Adelaide 2022-11-06 1872
41 India Zimbabwe India 71 runs Melbourne 2022-11-06 1873
42 New Zealand Pakistan Pakistan 7 wickets Sydney 2022-11-09 1877
43 England India England 10 wickets Adelaide 2022-11-10 1878
44 England Pakistan England 5 wickets Melbourne 2022-11-13 1879
In [4]:
dim_players = pd.read_csv('dim_players.csv')
print(dim_players.info())
dim_players.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          219 non-null    object
 1   team          219 non-null    object
 2   battingStyle  219 non-null    object
 3   bowlingStyle  199 non-null    object
 4   playingRole   219 non-null    object
dtypes: object(5)
memory usage: 8.7+ KB
None
Out[4]:
name team battingStyle bowlingStyle playingRole
0 Najmul Hossain Shanto Bangladesh Left hand Bat Right arm Offbreak Top order Batter
1 Soumya Sarkar Bangladesh Left hand Bat Right arm Medium fast Middle order Batter
2 Litton Das Bangladesh Right hand Bat NaN Wicketkeeper Batter
3 Shakib Al Hasan(c) Bangladesh Left hand Bat Slow Left arm Orthodox Allrounder
4 Afif Hossain Bangladesh Left hand Bat Right arm Offbreak Allrounder
In [5]:
 
import pandas as pd


dim_players = pd.read_csv('dim_players.csv')

# Filling NaN values with specified defaults
dim_players['bowlingStyle'].fillna('None', inplace=True)
dim_players['battingStyle'].fillna('None', inplace=True)
dim_players['playingRole'].fillna('Unknown', inplace=True)



# Converting  teamInnings and batsmanName to Proper case
dim_players['name'] = dim_players['name'].astype(str).str.strip().str.lower().str.title()
dim_players['playingRole'] = dim_players['playingRole'].str.strip().str.lower().str.title()
dim_players['team'] = dim_players['team'].str.strip().str.lower().str.title()
dim_players['battingStyle'] = dim_players['battingStyle'].str.strip().str.lower().str.title()
dim_players['bowlingStyle'] = dim_players['bowlingStyle'].str.strip().str.lower().str.title()


# Checking for duplicates in the DataFrame
print(f"Number of duplicate rows: {dim_players.duplicated().sum()}")


dim_players.to_csv('Infoplayers.csv', index=False)


dim_players.head(30)
Number of duplicate rows: 0
Out[5]:
name team battingStyle bowlingStyle playingRole
0 Najmul Hossain Shanto Bangladesh Left Hand Bat Right Arm Offbreak Top Order Batter
1 Soumya Sarkar Bangladesh Left Hand Bat Right Arm Medium Fast Middle Order Batter
2 Litton Das Bangladesh Right Hand Bat None Wicketkeeper Batter
3 Shakib Al Hasan(C) Bangladesh Left Hand Bat Slow Left Arm Orthodox Allrounder
4 Afif Hossain Bangladesh Left Hand Bat Right Arm Offbreak Allrounder
5 Mosaddek Hossain Bangladesh Right Hand Bat Right Arm Offbreak Middle Order Batter
6 Nurul Hasan Bangladesh Right Hand Bat None Wicketkeeper Batter
7 Yasir Ali Bangladesh Right Hand Bat Right Arm Offbreak Middle Order Batter
8 Wessly Madhevere Zimbabwe Right Hand Bat Right Arm Offbreak Allrounder
9 Craig Ervine(C) Zimbabwe Left Hand Bat Right Arm Offbreak Middle Order Batter
10 Milton Shumba Zimbabwe Left Hand Bat Slow Left Arm Orthodox Top Order Batter
11 Sean Williams Zimbabwe Left Hand Bat Slow Left Arm Orthodox Middle Order Batter
12 Sikandar Raza Zimbabwe Right Hand Bat Right Arm Offbreak Batting Allrounder
13 Regis Chakabva Zimbabwe Right Hand Bat Right Arm Offbreak Wicketkeeper Batter
14 Ryan Burl Zimbabwe Left Hand Bat Legbreak Middle Order Batter
15 Brad Evans Zimbabwe Right Hand Bat Right Arm Fast Allrounder
16 Richard Ngarava Zimbabwe Left Hand Bat Left Arm Fast Medium Bowler
17 Blessing Muzarabani Zimbabwe Right Hand Bat Right Arm Fast Medium Bowler
18 Tendai Chatara Zimbabwe Right Hand Bat Right Arm Fast Medium Bowler
19 Taskin Ahmed Bangladesh Left Hand Bat Right Arm Fast Bowler
20 Hasan Mahmud Bangladesh Right Hand Bat Right Arm Medium Bowler
21 Mustafizur Rahman Bangladesh Left Hand Bat Left Arm Fast Medium Bowler
22 Shakib Al Hasan Bangladesh Left Hand Bat Slow Left Arm Orthodox Allrounder
23 Nasum Ahmed Bangladesh Left Hand Bat Slow Left Arm Orthodox Bowler
24 Mohammad Rizwan Pakistan Right Hand Bat None Wicketkeeper Batter
25 Babar Azam(C) Pakistan Right Hand Bat Right Arm Offbreak Batter
26 Mohammad Nawaz Pakistan Left Hand Bat Slow Left Arm Orthodox Allrounder
27 Mohammad Haris Pakistan Right Hand Bat Right Arm Offbreak Middle Order Batter
28 Shan Masood Pakistan Left Hand Bat Right Arm Medium Fast Opening Batter
29 Iftikhar Ahmed Pakistan Right Hand Bat Right Arm Offbreak Middle Order Batter
In [6]:
fact_bowling_summary = pd.read_csv('fact_bowling_summary.csv')
print(fact_bowling_summary.info())
fact_bowling_summary.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   match        500 non-null    object 
 1   bowlingTeam  500 non-null    object 
 2   bowlerName   500 non-null    object 
 3   overs        500 non-null    float64
 4   maiden       500 non-null    int64  
 5   runs         500 non-null    int64  
 6   wickets      500 non-null    int64  
 7   economy      500 non-null    float64
 8   0s           500 non-null    int64  
 9   4s           500 non-null    int64  
 10  6s           500 non-null    int64  
 11  wides        500 non-null    int64  
 12  noBalls      500 non-null    int64  
 13  match_id     500 non-null    object 
dtypes: float64(2), int64(8), object(4)
memory usage: 54.8+ KB
None
Out[6]:
match bowlingTeam bowlerName overs maiden runs wickets economy 0s 4s 6s wides noBalls match_id
0 Namibia Vs Sri Lanka Sri Lanka Maheesh Theekshana 4.0 0 23 1 5.75 7 0 0 2 0 T20I # 1823
1 Namibia Vs Sri Lanka Sri Lanka Dushmantha Chameera 4.0 0 39 1 9.75 6 3 1 2 0 T20I # 1823
2 Namibia Vs Sri Lanka Sri Lanka Pramod Madushan 4.0 0 37 2 9.25 6 3 1 0 0 T20I # 1823
3 Namibia Vs Sri Lanka Sri Lanka Chamika Karunaratne 4.0 0 36 1 9.00 7 3 1 1 0 T20I # 1823
4 Namibia Vs Sri Lanka Sri Lanka Wanindu Hasaranga de Silva 4.0 0 27 1 6.75 8 1 1 0 0 T20I # 1823
In [7]:
#Removing T20I# from the matchId
fact_bowling_summary['matchId'] = fact_bowling_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')

#Calculating Overall Extras Conceded
fact_bowling_summary['extras'] = fact_bowling_summary['wides'] + fact_bowling_summary['noBalls']

#Removing extra spaces and converting into lower
fact_bowling_summary['bowlingTeam'] = fact_bowling_summary['bowlingTeam'].astype(str).str.strip().str.lower()
fact_bowling_summary['bowlerName'] = fact_bowling_summary['bowlerName'].astype(str).str.strip().str.lower()

#Converting  teamInnings and batsmanName to Proper case
fact_bowling_summary['bowlingTeam'] = fact_bowling_summary['bowlingTeam'].str.title()
fact_bowling_summary['bowlerName'] =  fact_bowling_summary['bowlerName'].str.title()


#Detecting outliers in the economy
outliers = fact_bowling_summary[
    (fact_bowling_summary['economy'] > 36) |
    (fact_bowling_summary['overs'] > 4.0) |
    (fact_bowling_summary['wickets'] > 10) |
    (fact_bowling_summary['maiden'] > 4)
]

if outliers.empty:
    print("No Oultiers")
else:
    print("Outliers Detected:\n", outliers)
q="""SELECT bowlerName,runs,economy,
    (runs / economy) AS actual_overs_bowled
FROM
    outliers;
"""
actual_overs=mysql(q)
actual_overs
Outliers Detected:
                     match  bowlingTeam   bowlerName  overs  maiden  runs  \
12  U.A.E. Vs Netherlands  Netherlands  Tim Pringle    5.0       0    13   

    wickets  economy  0s  4s  6s  wides  noBalls       match_id matchId  \
12        1     3.25  11   0   0      0        0  T20I # 1825      1825   

    extras  
12       0  
Out[7]:
bowlerName runs economy actual_overs_bowled
0 Tim Pringle 13 3.25 4.0
In [8]:
fact_bowling_summary.loc[
    (fact_bowling_summary['matchId'] == '1825') & 
    (fact_bowling_summary['bowlerName'] == 'Tim Pringle'), 
    'overs'
] =actual_overs['actual_overs_bowled'][0]
updated_row = fact_bowling_summary[
    (fact_bowling_summary['matchId'] == '1825') & 
    (fact_bowling_summary['bowlerName'] == 'Tim Pringle')
]
updated_row
Out[8]:
match bowlingTeam bowlerName overs maiden runs wickets economy 0s 4s 6s wides noBalls match_id matchId extras
12 U.A.E. Vs Netherlands Netherlands Tim Pringle 4.0 0 13 1 3.25 11 0 0 0 0 T20I # 1825 1825 0
In [9]:
#Calculating Number of balls bowled
fact_bowling_summary['total_balls_bowled'] = (fact_bowling_summary['overs'].astype(int) * 6) + ((fact_bowling_summary['overs'] % 1) * 10).astype(int)

print(f"Number of duplicate rows: {fact_bowling_summary.duplicated().sum()}")
fact_bowling_summary.drop(columns='match_id',inplace=True)
fact_bowling_summary.to_csv('bowling_summary.csv', index=False)

fact_bowling_summary
Number of duplicate rows: 0
Out[9]:
match bowlingTeam bowlerName overs maiden runs wickets economy 0s 4s 6s wides noBalls matchId extras total_balls_bowled
0 Namibia Vs Sri Lanka Sri Lanka Maheesh Theekshana 4.0 0 23 1 5.75 7 0 0 2 0 1823 2 24
1 Namibia Vs Sri Lanka Sri Lanka Dushmantha Chameera 4.0 0 39 1 9.75 6 3 1 2 0 1823 2 24
2 Namibia Vs Sri Lanka Sri Lanka Pramod Madushan 4.0 0 37 2 9.25 6 3 1 0 0 1823 0 24
3 Namibia Vs Sri Lanka Sri Lanka Chamika Karunaratne 4.0 0 36 1 9.00 7 3 1 1 0 1823 1 24
4 Namibia Vs Sri Lanka Sri Lanka Wanindu Hasaranga De Silva 4.0 0 27 1 6.75 8 1 1 0 0 1823 0 24
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
495 Pakistan Vs England Pakistan Naseem Shah 4.0 0 30 0 7.50 15 3 1 1 0 1879 1 24
496 Pakistan Vs England Pakistan Haris Rauf 4.0 0 23 2 5.75 13 3 0 1 0 1879 1 24
497 Pakistan Vs England Pakistan Shadab Khan 4.0 0 20 1 5.00 10 1 0 0 0 1879 0 24
498 Pakistan Vs England Pakistan Mohammad Wasim 4.0 0 38 1 9.50 5 5 0 2 0 1879 2 24
499 Pakistan Vs England Pakistan Iftikhar Ahmed 0.5 0 13 0 15.60 0 1 1 0 0 1879 0 5

500 rows × 16 columns

In [10]:
fact_batting_summary = pd.read_csv('fact_bating_summary.csv')
print(fact_batting_summary.info())
fact_batting_summary.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   match        699 non-null    object
 1   teamInnings  699 non-null    object
 2   battingPos   699 non-null    int64 
 3   batsmanName  699 non-null    object
 4   runs         699 non-null    int64 
 5   balls        699 non-null    int64 
 6   4s           699 non-null    int64 
 7   6s           699 non-null    int64 
 8   SR           699 non-null    object
 9   out/not_out  699 non-null    object
 10  match_id     699 non-null    object
dtypes: int64(5), object(6)
memory usage: 60.2+ KB
None
Out[10]:
match teamInnings battingPos batsmanName runs balls 4s 6s SR out/not_out match_id
0 Namibia Vs Sri Lanka Namibia 1 Michael van Lingen 3 6 0 0 50 out T20I # 1823
1 Namibia Vs Sri Lanka Namibia 2 Divan la Cock 9 9 1 0 100 out T20I # 1823
2 Namibia Vs Sri Lanka Namibia 3 Jan Nicol Loftie-Eaton 20 12 1 2 166.66 out T20I # 1823
3 Namibia Vs Sri Lanka Namibia 4 Stephan Baard 26 24 2 0 108.33 out T20I # 1823
4 Namibia Vs Sri Lanka Namibia 5 Gerhard Erasmus(c) 20 24 0 0 83.33 out T20I # 1823
In [11]:
#Removing T20I# from the matchId
fact_batting_summary['matchId'] = fact_batting_summary['match_id'].str.extract(r'#\s(\d+[a-zA-Z]*)')

#Converting SR to Float Type
fact_batting_summary['SR'] = fact_batting_summary['SR'].replace({'-': np.nan}).fillna(0.00).astype(float).round(2)


#Removing extra spaces and converting into lower
fact_batting_summary['teamInnings'] = fact_batting_summary['teamInnings'].astype(str).str.strip().str.lower()
fact_batting_summary['batsmanName'] = fact_batting_summary['batsmanName'].astype(str).str.strip().str.lower()

# Converting  teamInnings and batsmanName to Proper case
fact_batting_summary['teamInnings'] = fact_batting_summary['teamInnings'].str.title()
fact_batting_summary['batsmanName'] = fact_batting_summary['batsmanName'].str.title()


# Converting out/not_out to binary
print(fact_batting_summary['out/not_out'].unique())
fact_batting_summary['out/not_out'] = fact_batting_summary['out/not_out'].astype(str).str.strip().str.lower()
fact_batting_summary['out/not_out'] = fact_batting_summary['out/not_out'].map({'out': 1, 'not_out': 0})


# Checking Outliers
normal_outliers = fact_batting_summary[
    (fact_batting_summary['runs'] < 0) |
    (fact_batting_summary['balls'] < 0) |
    (fact_batting_summary['4s'] < 0) |(fact_batting_summary['4s'] > fact_batting_summary['balls']) |
    (fact_batting_summary['6s'] < 0) |(fact_batting_summary['6s'] > fact_batting_summary['balls']) |
    (fact_batting_summary['SR'] < 0)
]
if normal_outliers.empty:
    print("No  Normal Oultiers")
else:
    print("Outliers Detected:\n",normal_outliers)


#Checking SR outliers
fact_batting_summary['calculated_SR'] = np.where(
    fact_batting_summary['balls'] > 0,
    (fact_batting_summary['runs'] / fact_batting_summary['balls']) * 100,
    0
)
strike_rate_outlier = fact_batting_summary[
    (fact_batting_summary['balls'] > 0) &  
    (abs(fact_batting_summary['SR'] - fact_batting_summary['calculated_SR']) > 1)
]
print(strike_rate_outlier)
q="""SELECT *,
    ROUND((SR * balls) / 100)  AS calculated_runs
FROM strike_rate_outlier;
"""
SR_outlier=mysql(q)
SR_outlier
['out' 'not_out' 'Out' 'not_Out' 'out  ' 'not_out  ']
No  Normal Oultiers
                      match teamInnings  battingPos       batsmanName  runs  \
263       Pakistan Vs India       India           5        Axar Patel    62   
407  Bangladesh Vs Zimbabwe    Zimbabwe           1  Wessly Madhevere    74   

     balls  4s  6s      SR  out/not_out     match_id matchId  calculated_SR  
263      3   0   0   66.66            1  T20I # 1842    1842    2066.666667  
407      3   1   0  133.33            1  T20I # 1851    1851    2466.666667  
Out[11]:
match teamInnings battingPos batsmanName runs balls 4s 6s SR out/not_out match_id matchId calculated_SR calculated_runs
0 Pakistan Vs India India 5 Axar Patel 62 3 0 0 66.66 1 T20I # 1842 1842 2066.666667 2.0
1 Bangladesh Vs Zimbabwe Zimbabwe 1 Wessly Madhevere 74 3 1 0 133.33 1 T20I # 1851 1851 2466.666667 4.0
In [12]:
fact_batting_summary.drop(columns=['match_id','calculated_SR'],inplace=True)
for index, row in SR_outlier.iterrows():
    fact_batting_summary.loc[
        (fact_batting_summary['matchId'] == row['matchId']) & 
        (fact_batting_summary['teamInnings'] == row['teamInnings']) & 
        (fact_batting_summary['battingPos'] == row['battingPos']) & 
        (fact_batting_summary['batsmanName'] == row['batsmanName']),
        'runs'
    ] = row['calculated_runs']

# Verify the updated rows
updated_rows = fact_batting_summary[
    (fact_batting_summary['matchId'].isin(SR_outlier['matchId'])) & 
    (fact_batting_summary['batsmanName'].isin(SR_outlier['batsmanName']))
]

updated_rows
Out[12]:
match teamInnings battingPos batsmanName runs balls 4s 6s SR out/not_out matchId
263 Pakistan Vs India India 5 Axar Patel 2 3 0 0 66.66 1 1842
407 Bangladesh Vs Zimbabwe Zimbabwe 1 Wessly Madhevere 4 3 1 0 133.33 1 1851
In [13]:
fact_batting_summary.to_csv('batting_summary.csv', index=False)
fact_batting_summary
Out[13]:
match teamInnings battingPos batsmanName runs balls 4s 6s SR out/not_out matchId
0 Namibia Vs Sri Lanka Namibia 1 Michael Van Lingen 3 6 0 0 50.00 1 1823
1 Namibia Vs Sri Lanka Namibia 2 Divan La Cock 9 9 1 0 100.00 1 1823
2 Namibia Vs Sri Lanka Namibia 3 Jan Nicol Loftie-Eaton 20 12 1 2 166.66 1 1823
3 Namibia Vs Sri Lanka Namibia 4 Stephan Baard 26 24 2 0 108.33 1 1823
4 Namibia Vs Sri Lanka Namibia 5 Gerhard Erasmus(C) 20 24 0 0 83.33 1 1823
... ... ... ... ... ... ... ... ... ... ... ...
694 Pakistan Vs England England 3 Phil Salt 10 9 2 0 111.11 1 1879
695 Pakistan Vs England England 4 Ben Stokes 52 49 5 1 106.12 0 1879
696 Pakistan Vs England England 5 Harry Brook 20 23 1 0 86.95 1 1879
697 Pakistan Vs England England 6 Moeen Ali 19 13 3 0 146.15 1 1879
698 Pakistan Vs England England 7 Liam Livingstone 1 1 0 0 100.00 0 1879

699 rows × 11 columns

In [14]:
print(fact_batting_summary['out/not_out'].unique())
[1 0]

KPI's Identification¶

In [15]:
from sqlalchemy import create_engine

from urllib.parse import quote


password = quote('Ullekh@123')
DATABASE_URL = f'mysql+pymysql://root:{password}@127.0.0.1/T20WorldCup'

engine = create_engine(DATABASE_URL)

matchdf = pd.read_sql_table('match_summary', engine)


matchdf['margin'] =matchdf['margin'].astype(str).str.strip().str.lower()
matchdf['margin'] =matchdf['margin'].str.title()
matchdf.head()
Out[15]:
team1 team2 winner margin ground matchDate matchId
0 Namibia Sri Lanka Namibia 55 Runs Geelong 2022-10-16 1823
1 Netherlands U.A.E. Netherlands 3 Wickets Geelong 2022-10-16 1825
2 Scotland West Indies Scotland 42 Runs Hobart 2022-10-17 1826
3 Ireland Zimbabwe Zimbabwe 31 Runs Hobart 2022-10-17 1828
4 Namibia Netherlands Netherlands 5 Wickets Geelong 2022-10-18 1830
In [16]:
battingdf=pd.read_sql_table('batting_summary',engine)
battingdf.head()
Out[16]:
match teamInnings battingPos batsmanName runs balls 4s 6s SR out/not_out matchId playerId
0 Namibia Vs Sri Lanka Namibia 1 Michael Van Lingen 3 6 0 0 50.00 1 1823 128
1 Namibia Vs Sri Lanka Namibia 2 Divan La Cock 9 9 1 0 100.00 1 1823 129
2 Namibia Vs Sri Lanka Namibia 3 Jan Nicol Loftie-Eaton 20 12 1 2 166.66 1 1823 131
3 Namibia Vs Sri Lanka Namibia 4 Stephan Baard 26 24 2 0 108.33 1 1823 130
4 Namibia Vs Sri Lanka Namibia 5 Gerhard Erasmus 20 24 0 0 83.33 1 1823 133
In [17]:
bowlingdf=pd.read_sql_table('bowling_summary',engine)
bowlingdf.head()
Out[17]:
match bowlingTeam bowlerName overs maiden runs wickets economy 0s 4s 6s wides noBalls matchId extras total_balls_bowled playerId
0 Namibia Vs Sri Lanka Sri Lanka Maheesh Theekshana 4.0 0 23 1 5.75 7 0 0 2 0 1823 2 24 73
1 Namibia Vs Sri Lanka Sri Lanka Dushmantha Chameera 4.0 0 39 1 9.75 6 3 1 2 0 1823 2 24 192
2 Namibia Vs Sri Lanka Sri Lanka Pramod Madushan 4.0 0 37 2 9.25 6 3 1 0 0 1823 0 24 71
3 Namibia Vs Sri Lanka Sri Lanka Chamika Karunaratne 4.0 0 36 1 9.00 7 3 1 1 0 1823 1 24 164
4 Namibia Vs Sri Lanka Sri Lanka Wanindu Hasaranga De Silva 4.0 0 27 1 6.75 8 1 1 0 0 1823 0 24 74
In [18]:
playersdf=pd.read_sql_table('infoplayers',engine)

playersdf.head()
Out[18]:
name team battingStyle bowlingStyle playingRole playerId
0 Najmul Hossain Shanto Bangladesh Left Hand Bat Right Arm Offbreak Top Order Batter 1
1 Soumya Sarkar Bangladesh Left Hand Bat Right Arm Medium Fast Middle Order Batter 2
2 Litton Das Bangladesh Right Hand Bat None Wicketkeeper Batter 3
3 Shakib Al Hasan Bangladesh Left Hand Bat Slow Left Arm Orthodox Allrounder 4
4 Afif Hossain Bangladesh Left Hand Bat Right Arm Offbreak Allrounder 5

Overall Tournament KPIs¶

In [19]:
import pandas as pd

queries = {
    "Total Matches": """
        SELECT COUNT(DISTINCT matchId) as Total_Matches 
        FROM matchdf;
    """,
    "Total Runs Scored": """
        SELECT SUM(runs) AS total_runs_scored
        FROM battingdf;
    """,
    "Total Wickets Taken": """
        SELECT SUM(wickets) AS total_wickets_taken
        FROM bowlingdf;
    """,
    "Total Boundaries": """
        SELECT SUM("4s") + SUM("6s") AS Total_boundaries
        FROM battingdf;
    """,
    "Average Runs Per Innings": """
        SELECT ROUND(AVG(total_runs), 2) AS avg_runs_per_innings
        FROM (
            SELECT matchId, teamInnings, SUM(runs) AS total_runs
            FROM battingdf
            GROUP BY matchId, teamInnings
        ) AS innings_runs;
    """,
    "Highest Run Scorer": """
        SELECT batsmanName,
               SUM(runs) AS total_runs
        FROM battingdf
        GROUP BY batsmanName
        ORDER BY total_runs DESC
        LIMIT 1;
    """,
    "Highest Wicket Taker": """
        SELECT bowlerName,
               SUM(wickets) AS total_wickets
        FROM bowlingdf
        GROUP BY bowlerName
        ORDER BY total_wickets DESC
        LIMIT 1;
    """,
    "Highest Individual Score": """
        SELECT batsmanName, MAX(runs) AS highest_individual_score, matchId
        FROM battingdf
        GROUP BY batsmanName, matchId
        ORDER BY highest_individual_score DESC
        LIMIT 1;
    """,
    "Highest Individual Wickets": """
        SELECT bowlerName, MAX(wickets) AS highest_individual_wickets, runs AS runs_given, matchId
        FROM bowlingdf
        GROUP BY bowlerName, runs, matchId
        ORDER BY highest_individual_wickets DESC
        LIMIT 1;
    """
}

results = {}
for kpi_name, query in queries.items():
    df = mysql(query)
    if kpi_name in ["Highest Run Scorer", "Highest Wicket Taker", "Highest Individual Score", "Highest Individual Wickets"]:
        if kpi_name == "Highest Run Scorer":
            results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['total_runs']})"
        elif kpi_name == "Highest Wicket Taker":
            results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['total_wickets']})"
        elif kpi_name == "Highest Individual Score":
            results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['highest_individual_score']})"
        elif kpi_name == "Highest Individual Wickets":
            results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['highest_individual_wickets']})"
    else:
        results[kpi_name] = df.iloc[0, 0]

kpi_df = pd.DataFrame(list(results.items()), columns=["KPI", "Value"])

kpi_df
Out[19]:
KPI Value
0 Total Matches 45
1 Total Runs Scored 11169
2 Total Wickets Taken 515
3 Total Boundaries 1240
4 Average Runs Per Innings 132.96
5 Highest Run Scorer Virat Kohli (296)
6 Highest Wicket Taker Wanindu Hasaranga De Silva (15)
7 Highest Individual Score Rilee Rossouw (109)
8 Highest Individual Wickets Sam Curran (5)

Team KPI's¶

In [20]:
import pandas as pd

team_name = 'India'

queries = {
    "Total Matches": f"""
        SELECT COUNT(DISTINCT matchId) AS Total_Matches 
        FROM matchdf
        WHERE team1 = '{team_name}' OR team2='{team_name}';
    """,
    "Win/Loss Ratio": f"""
        SELECT 
        ROUND(SUM(wins) * 1.0 / SUM(total_matches) * 100, 2) AS win_loss_ratio
        FROM (
            SELECT 
                SUM(CASE WHEN winner = team1 THEN 1 ELSE 0 END) AS wins,
                COUNT(*) AS total_matches
            FROM matchdf
            WHERE team1 = '{team_name}'
            UNION ALL
            SELECT 
                SUM(CASE WHEN winner = team2 THEN 1 ELSE 0 END) AS wins,
                COUNT(*) AS total_matches
            FROM matchdf
            WHERE team2 = '{team_name}'
        ) AS combined
    """,
    "Total Runs Scored": f"""
        SELECT SUM(runs) AS total_runs_scored
        FROM battingdf
        WHERE teamInnings = '{team_name}';
    """,
    "Total Wickets Taken": f"""
        SELECT SUM(wickets) AS total_wickets_taken
        FROM bowlingdf
        WHERE bowlingTeam = '{team_name}';
    """,
    "Total Boundaries": f"""
        SELECT SUM("4s") + SUM("6s") AS Total_boundaries
        FROM battingdf
        WHERE teamInnings = '{team_name}';
    """,
    "AVG Runs Per Innings": f"""
        SELECT ROUND(AVG(total_runs), 2) AS avg_runs_per_innings
        FROM (
            SELECT matchId, teamInnings, SUM(runs) AS total_runs
            FROM battingdf
            WHERE teamInnings = '{team_name}'
            GROUP BY matchId, teamInnings
        ) AS innings_runs;
    """,
    "AVG Economy Rate": f"""
        SELECT ROUND(SUM(runs) * 1.0 / NULLIF(SUM(overs), 0), 2) AS economy_rate
        FROM bowlingdf
        WHERE bowlingTeam = '{team_name}';
    """,
    "Highest Run Scorer": f"""
        SELECT batsmanName,
               SUM(runs) AS total_runs
        FROM battingdf
        WHERE teamInnings = '{team_name}'
        GROUP BY batsmanName
        ORDER BY total_runs DESC
        LIMIT 1;
    """,
    "Highest Wicket Taker": f"""
        SELECT bowlerName,
               SUM(wickets) AS total_wickets
        FROM bowlingdf
        WHERE bowlingTeam = '{team_name}'
        GROUP BY bowlerName
        ORDER BY total_wickets DESC
        LIMIT 1;
    """
}

results = {}
for kpi_name, query in queries.items():
    df = mysql(query)
    if kpi_name in ["Highest Run Scorer", "Highest Wicket Taker"]:
        if not df.empty:
            if kpi_name == "Highest Run Scorer":
                results[kpi_name] = f"{df.iloc[0]['batsmanName']} ({df.iloc[0]['total_runs']})"
            elif kpi_name == "Highest Wicket Taker":
                results[kpi_name] = f"{df.iloc[0]['bowlerName']} ({df.iloc[0]['total_wickets']})"
    else:
        results[kpi_name] = df.iloc[0, 0] if not df.empty else None

team_kpi = pd.DataFrame(list(results.items()), columns=["KPI", "Value"])

team_kpi
Out[20]:
KPI Value
0 Total Matches 6
1 Win/Loss Ratio 66.67
2 Total Runs Scored 966
3 Total Wickets Taken 37
4 Total Boundaries 118
5 AVG Runs Per Innings 161.0
6 AVG Economy Rate 7.64
7 Highest Run Scorer Virat Kohli (296)
8 Highest Wicket Taker Arshdeep Singh (10)

Players KPI's¶

In [21]:
def generate_player_kpis(player_name):
    queries = {
        "Batter": {
            "Total Runs Scored": """
                SELECT SUM(b.runs) AS total_runs_scored 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Batting Average": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN b.`out/not_out` = 1 THEN b.matchId END), 0) AS batting_average 
                FROM battingdf b
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Average Balls Faced": """
                SELECT CAST(SUM(b.balls) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0) AS avg_balls_faced 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "AVG StrikeRate":"""
             SELECT AVG(SR) AS average_strike_rate
             FROM battingdf
             WHERE batsmanName='{player_name}'
             """,
            "Boundary Percentage": """
                SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(SUM(b.balls), 0)) * 100 AS boundary_percentage 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Boundaries Hit Per Match": """
                SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0)) AS boundaries_per_match 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Boundary Frequency (balls per boundary)": """
                SELECT (SUM(b.balls) / NULLIF(SUM(b.`4s` + b.`6s`), 0)) AS boundary_frequency 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """
        },
        "Bowler": {
            "Total Wickets Taken": """
                SELECT SUM(b.wickets) AS total_wickets_taken 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Bowling Average": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_average 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Economy Rate": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.total_balls_bowled) / 6, 0) AS economy_rate 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Bowling Strike Rate": """
                SELECT CAST(SUM(b.total_balls_bowled) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_strike_rate 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Dot Balls Bowled Percentage": """
                SELECT (CAST(SUM(b.`0s`) AS FLOAT) / CAST(SUM(b.total_balls_bowled) AS FLOAT)) * 100 AS dot_balls_bowled_percentage 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """
        },
        "Allrounder": {
            "Total Runs Scored": """
                SELECT SUM(b.runs) AS total_runs_scored 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Batting Average": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN b.`out/not_out` = 1 THEN b.matchId END), 0) AS batting_average 
                FROM battingdf b
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Average Balls Faced": """
                SELECT CAST(SUM(b.balls) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0) AS avg_balls_faced 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Boundary Percentage": """
                SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(SUM(b.balls), 0)) * 100 AS boundary_percentage 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Boundaries Hit Per Match": """
                SELECT (CAST(SUM(b.`4s`) + SUM(b.`6s`) AS FLOAT) / NULLIF(COUNT(DISTINCT b.matchId), 0)) AS boundaries_per_match 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Boundary Frequency (balls per boundary)": """
                SELECT (SUM(b.balls) / NULLIF(SUM(b.`4s` + b.`6s`), 0)) AS boundary_frequency 
                FROM battingdf b 
                JOIN playersdf p ON b.batsmanName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Total Wickets Taken": """
                SELECT SUM(b.wickets) AS total_wickets_taken 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Bowling Average": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_average 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Economy Rate": """
                SELECT CAST(SUM(b.runs) AS FLOAT) / NULLIF(SUM(b.total_balls_bowled) / 6, 0) AS economy_rate 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Bowling Strike Rate": """
                SELECT CAST(SUM(b.total_balls_bowled) AS FLOAT) / NULLIF(SUM(b.wickets), 0) AS bowling_strike_rate 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """,
            "Dot Balls Bowled Percentage": """
                SELECT (CAST(SUM(b.`0s`) AS FLOAT) / CAST(SUM(b.total_balls_bowled) AS FLOAT)) * 100 AS dot_balls_bowled_percentage 
                FROM bowlingdf b 
                JOIN playersdf p ON b.bowlerName = p.name 
                WHERE p.name = '{player_name}'
                GROUP BY p.name;
            """
        }
    }

    role_query = f"""
        SELECT playingRole AS playingRole 
        FROM playersdf 
        WHERE name='{player_name}'
    """.format(player_name=player_name)
    
    role_df = mysql(role_query)
    
    if role_df.empty:
        print(f"No role found for player: {player_name}")
        return []

    role = role_df.iloc[0]['playingRole']
    
    print(f"Role retrieved for {player_name}: {role}")

    if role in ['Top Order Batter', 'Middle Order Batter', 'Wicketkeeper Batter', 'Opening Batter', 'Batter']:
        kpi_queries = queries["Batter"]
    elif role in ['Bowler']:
        kpi_queries = queries["Bowler"]
    elif role in ['Allrounder', 'Batting Allrounder', 'Bowling Allrounder']:
        kpi_queries = queries["Allrounder"]
    else:
        print(f"Unrecognized role: {role}")
        return []
    
    kpi_data = []
    for kpi_name, query in kpi_queries.items():
        formatted_query = query.format(player_name=player_name)
        
        kpi_df = mysql(formatted_query)
        if not kpi_df.empty:
            kpi_value = kpi_df.iloc[0].values[0]
            kpi_value = round(kpi_value, 2)
            kpi_data.append({"KPI": kpi_name, "Value": kpi_value})
        else:
            print(f"No KPI data found for {kpi_name} for player: {player_name}")
    kpi_df = pd.DataFrame(kpi_data)
    return kpi_df
player_name = 'Sam Curran'
kpi_df = generate_player_kpis(player_name)
kpi_df
Role retrieved for Sam Curran: Allrounder
Out[21]:
KPI Value
0 Total Runs Scored 12.00
1 Batting Average 12.00
2 Average Balls Faced 7.00
3 Boundary Percentage 7.14
4 Boundaries Hit Per Match 0.50
5 Boundary Frequency (balls per boundary) 14.00
6 Total Wickets Taken 13.00
7 Bowling Average 11.38
8 Economy Rate 6.73
9 Bowling Strike Rate 10.38
10 Dot Balls Bowled Percentage 48.89

Visualization¶

In [22]:
import plotly.express as px
import plotly.graph_objects as go


team_colors = {
    'Afghanistan': 'green', 
    'Australia': 'yellow', 
    'Bangladesh': '#006400',  
    'England': '#00247D', 
    'India': '#0099FF',  
    'Ireland': '#009B77',  
    'Namibia': '#0033A0',  
    'Netherlands': 'darkOrange',  
    'New Zealand': '#000000',  
    'Pakistan': '#004B49',  
    'Scotland': '#0033A0',  
    'South Africa': '#008C8C',  
    'Sri Lanka': '#FFD700',  
    'U.A.E.': 'chocolate',  
    'West Indies': '#8A2C2D',  
    'Zimbabwe': 'tomato'  
}



def h_bar_plot(df, x_column, y_column, title, x_title, y_title):
    fig = px.bar(df, x=x_column, y=y_column, color=y_column, text=df[x_column],
                 title=title, labels={x_column: x_title, y_column: y_title},
                 color_discrete_map=team_colors, orientation='h')
    
    fig.update_traces(marker_line_width=1.5, opacity=0.8, 
                      textposition='outside')
    
    fig.update_layout(title_font_size=24, showlegend=True)
    
    fig.show()



def bar_plot(df, x_column, y_column, title, x_title, y_title):
    
    fig = px.bar(df, x=x_column, y=y_column, color=x_column,text=df[y_column],
                 title=title, labels={x_column: x_title, y_column: y_title},
                 color_discrete_map=team_colors)
    
   
    fig.update_traces(marker_line_width=1.5, opacity=0.8,  textposition='outside')
    fig.update_layout(title_font_size=24,showlegend=True)  
    fig.show()
    





def create_scatter_plot(df, x_column, y_column, title, x_title, y_title, size_column=None):
    
    if size_column:
        fig = px.scatter(
            df, 
            x=x_column, 
            y=y_column, 
            color=x_column,  
            size=size_column,  
            title=title, 
            labels={x_column: x_title, y_column: y_title},
            color_discrete_map=team_colors  
        )
    else:
        fig = go.Figure()
        for team, color in team_colors.items():
            team_data = df[df['team'] == team]
            if not team_data.empty:
                
                hover_text = team_data.apply(lambda row: f'Team: {team}<br>{x_title}: {row[x_column]:.2f}<br>{y_title}: {row[y_column]:.2f}', axis=1)
                
                fig.add_trace(go.Scatter(
                    x=team_data[x_column],
                    y=team_data[y_column],
                    mode='markers',
                    marker=dict(
                        size=20,  
                        color=color,
                        opacity=1,
                        line=dict(width=2, color=color)
                    ),
                    hovertext=hover_text,  
                    hoverinfo='text',  
                    name=team,
                    showlegend=True  
                ))
    
    fig.update_layout(title=title, title_font_size=24, xaxis_title=x_title, yaxis_title=y_title, showlegend=True)
    fig.show()
In [23]:
q="""SELECT b.batsmanName as BatsmanName,MAX(b.runs) AS "Top Individual Scores",
    m.matchId as MatchId,DATE(m.matchDate)as MatchDate,
    m.ground as Ground,
    CASE 
        WHEN teamInnings = m.team1 THEN m.team2
        WHEN teamInnings= m.team2 THEN m.team1
    END AS VS
FROM battingdf b
JOIN matchdf m ON b.matchId = m.matchId
GROUP BY b.batsmanName, m.matchId, m.matchDate, m.ground, VS
ORDER BY "Top individual scores" DESC
LIMIT 10;

"""
q1=mysql(q)
q1
Out[23]:
BatsmanName Top Individual Scores MatchId MatchDate Ground VS
0 Rilee Rossouw 109 1847 2022-10-27 Sydney Bangladesh
1 Glenn Phillips 104 1850 2022-10-29 Sydney Sri Lanka
2 Devon Conway 92 1839 2022-10-22 Sydney Australia
3 Alex Hales 86 1878 2022-11-10 Adelaide India
4 Michael Jones 86 1833 2022-10-19 Hobart Ireland
5 Sikandar Raza 82 1828 2022-10-17 Hobart Ireland
6 Virat Kohli 82 1842 2022-10-23 Melbourne Pakistan
7 Jos Buttler 80 1878 2022-11-10 Adelaide India
8 Kusal Mendis 79 1835 2022-10-20 Geelong Netherlands
9 Pathum Nissanka 74 1832 2022-10-18 Geelong U.A.E.
In [24]:
q="""SELECT b.bowlerName AS "Bowler Name",
       MAX(b.wickets) AS "Top Individual Wickets",
       b.runs AS "Runs Given",
       m.matchId AS "MatchId",
       DATE(m.matchDate) AS "MatchDate",
       m.ground AS "Ground",
       CASE 
           WHEN b.bowlingTeam = m.team1 THEN m.team2
           WHEN b.bowlingTeam = m.team2 THEN m.team1
           ELSE NULL
       END AS "VS"
FROM bowlingdf b
JOIN matchdf m ON b.matchId = m.matchId
GROUP BY b.bowlerName, m.matchId, DATE(m.matchDate), m.ground, "VS"
ORDER BY "Top Individual Wickets" DESC
LIMIT 10;


"""
q2=mysql(q)
q2
Out[24]:
Bowler Name Top Individual Wickets Runs Given MatchId MatchDate Ground VS
0 Sam Curran 5 10 1840 2022-10-22 Perth Afghanistan
1 Alzarri Joseph 4 16 1834 2022-10-19 Hobart Zimbabwe
2 Anrich Nortje 4 10 1847 2022-10-27 Sydney Bangladesh
3 Anrich Nortje 4 41 1861 2022-11-03 Sydney Pakistan
4 Lungi Ngidi 4 29 1853 2022-10-30 Perth India
5 Mohammad Wasim 4 24 1849 2022-10-27 Perth Zimbabwe
6 Shaheen Shah Afridi 4 22 1872 2022-11-06 Adelaide Bangladesh
7 Taskin Ahmed 4 25 1843 2022-10-24 Hobart Netherlands
8 Trent Boult 4 13 1850 2022-10-29 Sydney Sri Lanka
9 Arshdeep Singh 3 32 1842 2022-10-23 Melbourne Pakistan
In [25]:
q = """
SELECT
    CASE 
        WHEN margin LIKE '%runs%' THEN 'Batting First(Defending)'
        WHEN margin LIKE '%wickets%' THEN 'Bowling First(Chasing)'
    END AS winning_method,
    COUNT(*) AS win_count
FROM matchdf
WHERE margin LIKE '%runs%' OR margin LIKE '%wickets%'
GROUP BY winning_method;
"""
q3 = mysql(q)

fig = px.pie(
    q3,
    names='winning_method',
    values='win_count',
    title='Winning Strategies: Batting First vs Bowling First',
    color_discrete_sequence=['skyblue', 'lightgreen']
)

fig.update_layout(title_font_size=24)  
fig.show()
In [26]:
q = """
SELECT winner AS team,
       COUNT(*) AS win_count
FROM matchdf
WHERE winner IS NOT NULL AND winner != 'No Result'
GROUP BY team
ORDER BY win_count DESC;
"""

q4 = mysql(q)


h_bar_plot(q4, 'win_count', 'team', 'Number of Wins by Team', 'Number of Wins', 'Team')
In [27]:
q="""WITH BattingCategory AS (
    SELECT
        teamInnings,
        CASE 
            WHEN battingPos IN (1, 2) THEN 'Openers'
            WHEN battingPos IN (3, 4, 5) THEN 'Middle Order'
            WHEN battingPos IN (6, 7) THEN 'Lower Middle Order'
            ELSE 'Lower Order'
        END AS "Batting Category",
        runs,
        balls,
        "4s",
        "6s"
    FROM
        battingdf
),
Metrics AS (
    SELECT
        "Batting Category",
        ROUND(SUM(runs), 2) AS "Total Runs",
        ROUND(AVG(runs), 2) AS "Average Runs",
        ROUND(SUM(balls), 2) AS "Total Balls Faced",
        ROUND((SUM("4s") + SUM("6s")) * 100.0 / SUM(balls), 2) AS "Boundary Percentage"
    FROM BattingCategory
    GROUP BY "Batting Category"
)
SELECT
    "Batting Category",
    "Total Runs",
    "Average Runs",
    ROUND("Total Runs" * 100.0 / NULLIF("Total Balls Faced", 0), 2) AS "AVG Strike Rate",
    "Total Balls Faced",
    "Boundary Percentage"
FROM Metrics
ORDER BY CASE "Batting Category"
        WHEN 'Openers' THEN 1
        WHEN 'Middle Order' THEN 2
        WHEN 'Lower Middle Order' THEN 3
        WHEN 'Lower Order' THEN 4
END;
"""
q5=mysql(q)
q5
create_scatter_plot(q5, 'Batting Category', 
                  'AVG Strike Rate', 
                   size_column='Average Runs',
                   title="Batting Performance by Category",
                   x_title="Batting Category",
                   y_title="Average Strike Rate")
In [28]:
q="""SELECT teamInnings as Team, SUM(runs) AS total_runs
FROM battingdf
GROUP BY Team
ORDER BY total_runs DESC;
"""
q6=mysql(q)
bar_plot(q6, 'Team', 'total_runs', 'Total Runs Scored by Each Team', 'Team', 'Total Runs')
In [29]:
q = """
SELECT teamInnings as Team, ROUND(AVG(total_runs),2) AS "Avg Runs Per Match"
FROM (
    SELECT teamInnings, matchId, SUM(runs) AS total_runs
    FROM battingdf
    GROUP BY teamInnings, matchId
) AS match_runs
GROUP BY Team
ORDER BY "Avg Runs Per Match" DESC;
"""

q7 = mysql(q)

bar_plot(q7, 'Team', 'Avg Runs Per Match', 'AVG Runs Scored Per Match by Each Team', 'Team', 'AVG Runs')
In [30]:
q= """SELECT team,ROUND(SUM(wins) * 1.0 / SUM(total_matches) * 100 ,2) AS "Win Loss Ratio"
      FROM(
      SELECT team1 AS team,
                   SUM(CASE WHEN winner = team1 THEN 1 ELSE 0 END) AS wins,
                   COUNT(*) AS total_matches
            FROM matchdf
            GROUP BY team1
            UNION ALL
            SELECT team2 AS team,
                   SUM(CASE WHEN winner = team2 THEN 1 ELSE 0 END) AS wins,
                   COUNT(*) AS total_matches
            FROM matchdf
            GROUP BY team2
        ) AS combined
        GROUP BY team
        ORDER BY "Win Loss Ratio" DESC;
    """

q8=mysql(q)

h_bar_plot(q8, 'Win Loss Ratio', 'team', 'Wins Loss Ratio of Each Team(Percentage)', 'Win/Loss Ratio ', 'Team')
In [31]:
q= """
SELECT bowlingTeam AS team,
SUM(wickets) AS "Total Wickets"
FROM bowlingdf
GROUP BY bowlingTeam
ORDER BY "Total Wickets" DESC;
"""
q9=mysql(q)
h_bar_plot(q9, 'Total Wickets', 'team', 'Total Wickets Taken by Each Team', 'Total Wickets ', 'Team')
In [32]:
q= """SELECT bowlingTeam AS Team,
      ROUND(SUM(runs) * 1.0 / SUM(overs),2) AS "Economy Rate"
      FROM bowlingdf
      GROUP BY Team
      ORDER BY  "Economy Rate" DESC;
"""
q10=mysql(q)
bar_plot(q10, 'Team', 'Economy Rate', 'AVG Economy Rate by Each Team', 'Team', 'Economy Rate')
In [33]:
q="""
SELECT teamInnings AS Team,
ROUND((SUM(runs) * 1.0 / SUM(balls)) * 100,2) AS "Strike Rate"
FROM battingdf
GROUP BY Team
ORDER BY  "Strike Rate" DESC;
"""
q11=mysql(q)
bar_plot(q11, 'Team', 'Strike Rate', 'AVG Batting Strike Rate by Each Team', 'Team', 'Batting Strike Rate')
In [34]:
q= """
SELECT bowlingTeam AS Team,
ROUND((SUM(total_balls_bowled) * 1.0 / SUM(wickets)),2) AS "Bowling Strike Rate"
FROM bowlingdf
GROUP BY Team
ORDER BY "Bowling Strike Rate" DESC;
"""
q12=mysql(q)
bar_plot(q12, 'Team','Bowling Strike Rate', 'AVG Bowling Strike Rate by Each Team', 'Team', 'Bowling Strike Rate')
In [35]:
q="""WITH BattingDetails AS (
    SELECT
        b.teamInnings AS batting_team,
        bs.matchId,
        bs."0s",
        bs.total_balls_bowled,
        bs.wides,
        bs.noBalls
    FROM
        battingdf b
    JOIN
        bowlingdf bs
    ON
        b.matchId = bs.matchId
        AND b.teamInnings = CASE 
            WHEN bs.bowlingTeam = m.team1 THEN m.team2
            WHEN bs.bowlingTeam = m.team2 THEN m.team1
        END
    JOIN
        matchdf m
    ON
        bs.matchId = m.matchId
)
SELECT 
    b.batting_team as Team,
    SUM(b."0s") AS total_dot_balls_faced,
    SUM(b.total_balls_bowled - b.wides - b.noBalls) AS total_balls_faced,
    ROUND((SUM(b."0s") * 100.0) / SUM(b.total_balls_bowled - b.wides - b.noBalls),2) AS "Dot Ball Percentage"
FROM BattingDetails b
GROUP BY Team
ORDER BY "Dot Ball Percentage" DESC;


    """
q13=mysql(q)
bar_plot(q13, 'Team','Dot Ball Percentage', 'Dot Ball Percentage by Each Team(Batting)', 'Team', 'Dot Ball Percentage ')
In [36]:
q="""
SELECT bowlingTeam AS Team,
SUM(extras) / COUNT(DISTINCT matchId) AS "Avg Extras Per Match"
FROM bowlingdf
GROUP BY bowlingTeam
ORDER BY "Avg Extras Per Match" DESC;
"""
q14=mysql(q)
bar_plot(q14, 'Team','Avg Extras Per Match', 'Avg Extras Conceded Per Match by Each Team', 'Team', 'Extras Conceded')
In [37]:
q="""SELECT
    teamInnings AS team,
    ROUND(AVG(runs),2) AS "Average Runs",
    SUM(balls) AS total_balls_faced,
    (SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (1, 2) 
GROUP BY team;

"""
q15=mysql(q)

create_scatter_plot(q15, 'Average Runs', 'strike_rate', 'Openers AVG Runs vs Strike Rate by Team(Batting Position 1&2)', 'Average Runs', 'Strike Rate')



    
In [38]:
q="""SELECT
    teamInnings AS team,
    AVG(runs) AS "Average Runs",
    SUM(balls) AS total_balls_faced,
    (SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (3,4,5) 
GROUP BY team;

"""
q16=mysql(q)

create_scatter_plot(q16, 'Average Runs', 'strike_rate', 'Middle Order AVG Runs vs Strike Rate by Team(Batting Position 3,4,5)', 'Average Runs', 'Strike Rate')
In [39]:
q="""SELECT
    teamInnings AS team,
    AVG(runs) AS "Average Runs",
    SUM(balls) AS total_balls_faced,
    (SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)) AS strike_rate
FROM battingdf
WHERE battingPos IN (6,7) 
GROUP BY team;
"""
q17=mysql(q)
create_scatter_plot(q17, 'Average Runs', 'strike_rate', 'Lower Middle Order AVG Runs vs Strike Rate by Team(Batting Position 6,7)', 'Average Runs', 'Strike Rate')
In [40]:
q="""SELECT batsmanName as PlayerName,
    ROUND(CAST(SUM(runs) AS FLOAT) / NULLIF(COUNT(DISTINCT CASE WHEN `out/not_out` = 1 THEN matchId END), 0) ,2)AS "Average Runs",
    SUM(balls) AS total_balls_faced,
    ROUND((SUM(runs) * 100.0 / NULLIF(SUM(balls), 0)),2)AS strike_rate
FROM battingdf
GROUP BY PlayerName
HAVING SUM(balls) >= 75 
ORDER BY "Average Runs" DESC
LIMIT 10;

"""
q18=mysql(q)
fig = go.Figure()

fig = px.scatter(q18, 
                 x='Average Runs', 
                 y='strike_rate',  
                 size='Average Runs',    
                 color='PlayerName',  
                 title='Top 10 Batters: AVG Runs vs AVG SR(Minimum 75 balls Faced)',
                 labels={'Average Runs': 'Average Runs ', 'strike_rate': 'Strike Rate'})

fig.update_traces( marker=dict( opacity=1))  
fig.update_layout(title_font_size=24,showlegend=True)  

fig.update_layout(
    xaxis_title='Average Runs',
    yaxis_title='Strike Rate')
In [41]:
q="""SELECT
    BowlerName AS PlayerName,
    SUM(wickets) AS total_wickets,
    SUM(runs) AS total_runs_conceded,
    SUM(total_balls_bowled) AS total_balls_bowled,
    ROUND(CAST(SUM(runs) AS FLOAT) / NULLIF(SUM(total_balls_bowled)/ 6, 0),2) AS economy_rate
FROM bowlingdf
GROUP BY PlayerName
HAVING SUM(total_balls_bowled) >= 50 
ORDER BY total_wickets DESC
LIMIT 10;

"""
kpi=mysql(q)


fig = px.scatter(kpi, 
                 x='total_wickets', 
                 y='economy_rate',  
                 size='total_wickets',    
                 color='PlayerName',      
                 title='Top 10 Bowlers: Total Wickets vs Economy Rate',
                 labels={'total_wickets': 'Total Wickets', 'economy_rate': 'Economy Rate'})

fig.update_traces( marker=dict( opacity=1))  
fig.update_layout(title_font_size=24,showlegend=True)  


fig.update_layout(
    xaxis_title='Total Wickets',
    yaxis_title='Economy Rate')


fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: